import urllib2,random
from bs4 import BeautifulSoup
class spider(object):
    """Minimal HTML link extractor built on BeautifulSoup."""

    def on_get_link(self, data, field, href):
        """Return the non-empty values of attribute `href` from every
        `field` tag found in the HTML document `data`.

        :param data:  HTML markup (str/unicode) to parse.
        :param field: tag name to search for, e.g. 'a' or 'script'.
        :param href:  attribute whose value to collect, e.g. 'href' or 'src'.
        :return: list of stripped, non-empty attribute values (may be empty).
        """
        # NOTE(review): from_encoding only takes effect when `data` is a
        # byte string; already-decoded unicode input is parsed as-is.
        soup = BeautifulSoup(data, 'html5lib', from_encoding="gb18030")
        # Missing attributes default to '' so they are filtered out below.
        return [url
                for url in (tag.get(href, '').strip()
                            for tag in soup.find_all(field))
                if url]

    def get_link(self, data):
        """Extract links from an HTML document.

        :param data: HTML markup to parse.
        :return: 3-tuple (anchor hrefs, <link> hrefs, <script> srcs).
        """
        urls = self.on_get_link(data, 'a', 'href')
        css_urls = self.on_get_link(data, 'link', 'href')
        js_urls = self.on_get_link(data, 'script', 'src')
        return urls, css_urls, js_urls



if __name__=='__main__':
    url='http://topic.csdn.net/u/20120724/17/cb5c4c51-4cdb-45e9-a11a-c99de1c4fa38.html?99676'
    #url='http://www.baidu.com/'
    user_agents = [
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
                'Opera/9.25 (Windows NT 5.1; U; en)',
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
                'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
                'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
                ]
    headers={
            'User-Agent':user_agents[random.randint(0,len(user_agents)-1)],
            'Referer':'http://www.csdn.net/'
            }
    req=urllib2.Request(url=url,headers=headers)
    content=urllib2.urlopen(req).read().decode('utf-8')
    print type(content)
    #print content
    s=spider()
    t1=s.get_link(content)
    for i in t1:
        for n in i:
            print n
